Load libraries

library(msa)
library(ape)
library(seqinr)
library(ggtree)
library(tidyverse)

List all dereplicate fasta files

directory <- "./derep_fasta"
files <- list.files(path=directory, pattern=".fasta")

for (i in files){
        print(paste0(directory, "/", i))
}
## [1] "./derep_fasta/derep_anisakidae_18s.fasta"
## [1] "./derep_fasta/derep_anisakidae_coi.fasta"
## [1] "./derep_fasta/derep_anisakidae_coi5p.fasta"
## [1] "./derep_fasta/derep_anisakidae_coii.fasta"
## [1] "./derep_fasta/derep_anisakidae_its.fasta"

Dereplicate anisakidae 18S

Dereplicate anisakidae COX1

# read in fasta file
fcontent <- readLines("dedup_fasta/derep_anisakidae_coi5p.fasta")

seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))

mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)

for (i in 1:nrow(mSeqPos)){
        sequ <- ""
        for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
                sequ <- paste0(sequ, fcontent[j])
        }
        mSeqPos[i, "sequence"] <- sequ
}

seqDb <- mSeqPos %>% 
        as.data.frame() %>% 
        rownames_to_column('id') %>% 
        filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
        mutate(id=gsub('cf. ', '', id)) %>%
        mutate(id=gsub('aff. ', '', id)) %>% 
        mutate(id=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>% 
        mutate(genus=word(id, 2)) %>% 
        mutate(species=paste(word(id, 2), word(id,3))) %>%
        select(id, genus, species, sequence) %>% 
        column_to_rownames('id')

# Step 1: take the DNA sequence
DNASEQ <- seqDb[, 'sequence']
names(DNASEQ) <- rownames(seqDb)
# Step 2: convert to DNAStringSet
DNASS <- DNAStringSet(DNASEQ)
# or directly read in fasta using readDNAStringSet
# DNASS <- readDNAStringSet("fasta/subseq_anisakidae_18s.fasta")
# DNASS <- readDNAStringSet("fasta/subseq_anisakidae_coi5p.fasta")
# DNASS <- readDNAStringSet("dedup_fasta/derep_anisakidae_its.fasta")

# Step 3: multiple sequence alignment
alignment <- msa(DNASS, 'ClustalOmega')
# Step 4: clustering and distances
alignment <- msaConvert(alignment, "seqinr::alignment")
distMatrix <- dist.alignment(alignment, 'similarity')
clustering <- hclust(distMatrix)
# Step 5: APE to make cladogram
phylotree <- as.phylo(clustering)
# plot(phylotree, type="cladogram")

ggtree(phylotree) +
        geom_tiplab(size=2.5) +
        ggexpand(ratio=1.2, side='h')

Dereplicate anisakidae COX2

(same as abovementioned)

Dereplicate anisakidae ITS

(same as abovementioned)

Dereplicate anisakidae 18S

# read in fasta file
fcontent <- readLines("derep_fasta/derep_anisakidae_18s.fasta")

seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))

mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)

for (i in 1:nrow(mSeqPos)){
        sequ <- ""
        for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
                sequ <- paste0(sequ, fcontent[j])
        }
        mSeqPos[i, "sequence"] <- sequ
}

seqDb <- mSeqPos %>% 
        as.data.frame() %>% 
        rownames_to_column('id') %>% 
        #filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
        mutate(id=gsub('cf. ', '', id)) %>%
        mutate(id=gsub('aff. ', '', id)) %>% 
        mutate(tmp=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
        mutate(id=gsub('>', '', word(tmp, 1))) %>% 
        mutate(genus=word(tmp, 2)) %>% 
        mutate(species=paste(word(tmp, 2), word(tmp,3))) %>%
        select(id, genus, species, sequence)

# perform MSA using ClustalW CLI: ./run_clustalw.sh derep_fasta.fa
# import alignment (in clustal format)
alignment <- read.alignment(file="derep_fasta/derep_anisakidae_18s.aln", format="clustal")
# calculate distance matrix for MSA
distMatrix <- dist.alignment(alignment, "similarity")
# Construct a phylo tree using neighbor joining algorithm
mytree <- nj(distMatrix) %>% 
        phytools::midpoint.root()

tipLabelDf <- data.frame(mytree$tip.label)
colnames(tipLabelDf) <- "id"

tipLabelDf <- full_join(tipLabelDf, seqDb, by="id") %>% 
        mutate(id=paste(id, species))

rownames(tipLabelDf) <- tipLabelDf[["id"]]

mytree$tip.label <- tipLabelDf[["id"]]

p <- ggtree(mytree, options(ignore.negative.edge=TRUE)) #+
        # geom_tiplab(size=2.5) +
        # ggexpand(ratio=0.5, side="h")

p %<+% tipLabelDf +
        geom_tiplab(size=2.5) +
        theme_tree() +
        geom_tippoint(aes(color=genus), size=2) +
        ggexpand(ratio=0.4, side="h")

Dereplicate anisakidae COX1-5P

# read in fasta file
fcontent <- readLines("derep_fasta/derep_anisakidae_coi5p.fasta")

seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))

mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)

for (i in 1:nrow(mSeqPos)){
        sequ <- ""
        for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
                sequ <- paste0(sequ, fcontent[j])
        }
        mSeqPos[i, "sequence"] <- sequ
}

seqDb <- mSeqPos %>% 
        as.data.frame() %>% 
        rownames_to_column('id') %>% 
        #filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
        mutate(id=gsub('cf. ', '', id)) %>%
        mutate(id=gsub('aff. ', '', id)) %>% 
        mutate(tmp=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
        mutate(id=gsub('>', '', word(tmp, 1))) %>% 
        mutate(genus=word(tmp, 2)) %>% 
        mutate(species=paste(word(tmp, 2), word(tmp,3))) %>%
        select(id, genus, species, sequence)

# perform MSA using ClustalW CLI: ./run_clustalw.sh derep_fasta.fa
# import alignment (in clustal format)
alignment <- read.alignment(file="derep_fasta/derep_anisakidae_coi5p.aln", format="clustal")
# calculate distance matrix for MSA
distMatrix <- dist.alignment(alignment, "similarity")
# Construct a phylo tree using neighbor joining algorithm
mytree <- nj(distMatrix) %>% 
        phytools::midpoint.root()

tipLabelDf <- data.frame(mytree$tip.label)
colnames(tipLabelDf) <- "id"

tipLabelDf <- full_join(tipLabelDf, seqDb, by="id") %>% 
        mutate(id=paste(id, species))
rownames(tipLabelDf) <- tipLabelDf[["id"]]

mytree$tip.label <- tipLabelDf[["id"]]

p <- ggtree(mytree, options(ignore.negative.edge=TRUE)) #+
        # geom_tiplab(size=2.5) +
        # ggexpand(ratio=0.5, side="h")

p %<+% tipLabelDf +
        geom_tiplab(size=2.5) +
        theme_tree() +
        geom_tippoint(aes(color=genus), size=2) +
        ggexpand(ratio=0.2, side="h")

Dereplicate anisakidae ITS

# read in fasta file
fcontent <- readLines("derep_fasta/derep_anisakidae_its.fasta")

seqStart <- grep(">", fcontent)
seqEnd <- seqStart - 1
seqEnd <- seqEnd[-1]
seqEnd <- c(seqEnd, length(fcontent))

mSeqPos <- cbind(seqStart, seqEnd)
rownames(mSeqPos) <- fcontent[seqStart]
mSeqPos[, 1] <- mSeqPos[, 1] + 1
mSeqPos <- cbind(mSeqPos, sequence=NA)

for (i in 1:nrow(mSeqPos)){
        sequ <- ""
        for (j in mSeqPos[i, "seqStart"]:mSeqPos[i, "seqEnd"]){
                sequ <- paste0(sequ, fcontent[j])
        }
        mSeqPos[i, "sequence"] <- sequ
}

seqDb <- mSeqPos %>% 
        as.data.frame() %>% 
        rownames_to_column('id') %>% 
        #filter(grepl('Anisakis|Pseudoterranova|Phocascaris|Contracaecum', id)) %>%
        mutate(id=gsub('cf. ', '', id)) %>%
        mutate(id=gsub('aff. ', '', id)) %>% 
        mutate(tmp=paste(gsub('>', '', word(id, 1)), word(id, 2), word(id, 3))) %>%
        mutate(id=gsub('>', '', word(tmp, 1))) %>% 
        mutate(genus=word(tmp, 2)) %>% 
        mutate(species=paste(word(tmp, 2), word(tmp,3))) %>%
        select(id, genus, species, sequence)

# perform MSA using ClustalW CLI: ./run_clustalw.sh derep_fasta.fa
# import alignment (in clustal format)
alignment <- read.alignment(file="derep_fasta/derep_anisakidae_its.aln", format="clustal")
# calculate distance matrix for MSA
distMatrix <- dist.alignment(alignment, "similarity")
# Construct a phylo tree using neighbor joining algorithm
mytree <- nj(distMatrix) %>% 
        phytools::midpoint.root()

tipLabelDf <- data.frame(mytree$tip.label)
colnames(tipLabelDf) <- "id"

tipLabelDf <- full_join(tipLabelDf, seqDb, by="id") %>% 
        mutate(id=paste(id, species))
rownames(tipLabelDf) <- tipLabelDf[["id"]]

mytree$tip.label <- tipLabelDf[["id"]]

p <- ggtree(mytree, options(ignore.negative.edge=TRUE)) #+
        # geom_tiplab(size=2.5) +
        # ggexpand(ratio=0.5, side="h")

p %<+% tipLabelDf +
        geom_tiplab(size=2.5) +
        theme_tree() +
        geom_tippoint(aes(color=genus), size=2) +
        ggexpand(ratio=0.2, side="h")

Session information

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_SG.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_SG.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_SG.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_SG.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] lubridate_1.9.2     forcats_1.0.0       stringr_1.5.0      
##  [4] dplyr_1.1.1         purrr_1.0.1         readr_2.1.4        
##  [7] tidyr_1.3.0         tibble_3.2.1        ggplot2_3.4.2      
## [10] tidyverse_2.0.0     ggtree_3.7.2        seqinr_4.2-30      
## [13] ape_5.7-1           msa_1.26.0          Biostrings_2.62.0  
## [16] GenomeInfoDb_1.30.1 XVector_0.34.0      IRanges_2.28.0     
## [19] S4Vectors_0.32.4    BiocGenerics_0.40.0